# written by Martin Schwentner 06/29/2016
# Keeps only the Branchiopoda sequences from each input file (all other sequences are dropped) and writes them to new files.
# A second pass then removes the space after '>' in the headers of those output files.

import fileinput, glob, string, sys, os, re

from Bio import SeqIO

# Identifier tags of the Branchiopoda sequences to keep. A record is retained
# when any one of these tags occurs anywhere in its id.
species = (
    "Artemia_", "Branchin", "BranchLi", "Streptoc", "Thamnoce", "DaphniaP",
    "Evadne_c", "Podon_le", "Anchistr", "Bosmina_", "Eurycerc", "Scaphole",
    "Ceriodap", "Simoceph", "Moina_sp", "Sida_cry", "Macrothr", "Polyphem",
    "145Cycle", "146Cycle", "Lynceus_", "Lepiduru", "Triops_a", "TriopsGr",
    "TriopsLo", "Cyzicus_", "Metalimn", "Paralimn", "Eocyzicu", "OzestheR",
    "Ozesther", "Eoleptes", "Eulimnad", "EulTexan", "LimnadBi", "LimnadPa",
)


def is_branchiopod(seq_id):
    """Return True if *seq_id* contains at least one tag from ``species``."""
    return any(tag in seq_id for tag in species)


my_files = glob.glob("*renamed.fa")  # the files that will be opened

for filename in my_files:
    # One output file per input file. The with-block closes (and flushes) it
    # as soon as that input is done, and nothing is closed when no file matches.
    with open(filename + '_BranchOnly.fa', 'w') as out_file:
        for record in SeqIO.parse(filename, "fasta"):  # reads file as fasta file
            if is_branchiopod(record.id):
                # The header is deliberately written as "> id" (with a space),
                # exactly as before; the second part of the script removes it.
                # .write() is used instead of the print statement so the same
                # bytes are produced under both Python 2 and Python 3.
                out_file.write("> %s\n" % record.id)
                out_file.write("%s\n" % record.seq)

def remove_header_space(line):
    """Return *line* with every '> ' replaced by '>' if it is a header line.

    Lines that do not start with '>' are returned unchanged.
    """
    if line.startswith('>'):
        return line.replace('> ', '>')  # here the extra space is removed
    return line


my_files_2 = glob.glob("*_BranchOnly.fa")
for filename2 in my_files_2:
    # Both handles are closed by the with-block, so each edited file is fully
    # flushed to disk before the next one is processed.
    with open(filename2) as in_file_2, \
            open(filename2 + '_edit.fa', 'w') as out_file_2:
        for lines2 in in_file_2:
            out_file_2.write(remove_header_space(lines2))